library(bigMap)
# load aux. stuff
source('./mcsk15.R')

Load data

# input matrix: first 50 principal components, read from the gzipped CSV
X <- as.matrix(
  read.csv('./mcsk15_data.csv.gz')
)

ptSNE

# ./mcsk15/start.R
#
# Batch script: for each perplexity in ppx.list, run bdm.init and bdm.ptsne on
# X, then the EFR, kNP and hlCorr steps, and save the list of results to
# ./mcsk15_list.RData.

library(bigMap)

X <- as.matrix(read.csv('./mcsk15_data.csv.gz'))

threads <- 40
# perplexities expressed as fractions of the number of rows (0.5% ... 50%)
ppx.list <- round(nrow(X) * c(.005, .01, .05, .10, .20, .30, .40, .50), 0)

# +++ start MPI cluster
mpi.cl <- bdm.mpi.start(threads)
# return() is not valid at the top level of a script (it fails with an
# unrelated "no function to return from" error), so abort explicitly.
if (is.null(mpi.cl)) {
  stop('bdm.mpi.start() returned NULL: MPI cluster not available', call. = FALSE)
}

tryCatch({
  # +++ run
  m.list <- lapply(ppx.list, function(ppx) {
    # +++ compute betas
    m <- bdm.init(X, dSet.name = 'mck15', ppx = ppx, threads = threads, mpi.cl = mpi.cl)
    # +++ ptSNE
    m <- bdm.ptsne(NULL, m, lRate = NULL, theta = 0.0, threads = threads, mpi.cl = mpi.cl, layers = 2)
    # +++ EFR
    # NOTE(review): the result of this call is overwritten by the next
    # assignment to m.efr -- confirm whether the ppx = ppx run was meant to be
    # kept (e.g. by passing m.efr instead of list(m) to the second call).
    m.efr <- bdm.efr(NULL, list(m), ppx = ppx, iters = 100, threads = threads, mpi.cl = mpi.cl)
    # +++ EFR (ppx = 45)
    m.efr <- bdm.efr(NULL, list(m), ppx = 45, iters = 100, threads = threads, mpi.cl = mpi.cl)
    # +++ kNP
    m.efr <- lapply(m.efr, function(m) bdm.knp(NULL, m, threads = threads, mpi.cl = mpi.cl))
    # +++ hlC
    m.efr <- lapply(m.efr, function(m) bdm.hlCorr(NULL, m, threads = threads, mpi.cl = mpi.cl))
    # value of this iteration: the list of processed instances
    m.efr
  })

  save(m.list, file = './mcsk15_list.RData')
}, finally = {
  # +++ stop cluster (runs even if one of the steps above fails, so the
  # workers are not left behind)
  stopCluster(mpi.cl)
})

Submit job:

$ qsub -pe make 20 -l h_vmem=4G Rsckt ./mcsk15/start.R

Load output

# load output: restores m.list, the object saved by ./mcsk15/start.R
load('./mcsk15_list.RData')
# pt-SNE embedding: first element of each per-perplexity result
m.list1 <- lapply(m.list, `[[`, 1)

Range of perplexities

# perplexity stored in each embedding
sapply(
  m.list1,
  function(emb) emb$ppx$ppx
)
## [1]   224   448  2240  4481  8962 13442 17923 22404

Embedding Cost/Size

# call bdm.cost on every embedding; the returned list is kept in nulL only to
# avoid printing it
nulL <- lapply(
  m.list1,
  function(emb) bdm.cost(emb)
)

Embedding

# NOTE(review): mcsk15.legend() and mcsk15.lbls() are not defined in this file;
# presumably they come from the sourced ./mcsk15.R -- confirm.
mcsk15.legend()

# labels
# L is used below as the embeddings' lbls field and to index MACOSKO_COLORS1
L <- mcsk15.lbls(l = 1)

pt-SNE

# plot every embedding, coloured by the labels in L
nulL <- lapply(m.list1, function(emb) {
  # attach the labels to the local copy only; m.list1 itself is not modified
  emb[['lbls']] <- L
  bdm.ptsne.plot(emb, class.pltt = MACOSKO_COLORS1, ptsne.cex = 0.3)
})

hl-Correlation

pt-SNE

# One-row table with one column per perplexity, holding the 4th summary()
# statistic of each embedding's hlC values.
# Assumes m$hlC is a numeric vector, in which case the 4th entry of summary()
# is the mean -- TODO confirm.
# vapply() is used instead of sapply() so the result is always a numeric
# vector, whatever m.list1 contains.
hlTable <- vapply(m.list1, function(m) summary(m$hlC)[[4]], numeric(1))
hlTable <- matrix(hlTable, nrow = 1)
colnames(hlTable) <- vapply(m.list1, function(m) m$ppx$ppx, numeric(1))
rownames(hlTable) <- c('<hlC>')
# FALSE rather than F: F is an ordinary variable and can be reassigned
knitr::kable(hlTable, caption = 'hl-Correlation') %>%
  kable_styling(full_width = FALSE)
hl-Correlation
224 448 2240 4481 8962 13442 17923 22404
<hlC> 0.1328081 0.0965539 0.1739388 0.1886429 0.1988989 0.314965 0.9076846 0.8988457

PCA (first 2 components)

Note the high hl-Correlation (~90%) at high perplexities (40% and 50% of the data set size), and the similarity between the embedding and the plot of the first two principal components:

# PCA plot: first two columns of X, coloured by the labels in L
plot(
  X[, 1], X[, 2],
  pch = 15, cex = 0.3,
  col = MACOSKO_COLORS1[L]
)
# pt-SNE (ppx=17923): 7th embedding of the list, plotted with the same labels
m <- m.list1[[7]]
m[['lbls']] <- L
bdm.ptsne.plot(m, class.pltt = MACOSKO_COLORS1, ptsne.cex = 0.3)